This ipython file is the project by Hongyi Tang and Weijian Li for course 12752. There are four ipython files in the project in total. Each file consist of one cluster analysis task. In this file, the cluster analysis is demonstrated to 5 building types.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

%matplotlib inline

Please download the dataset and change the file path.


In [2]:
# Read in CBECS data
f = open('C:/F16-12-752-master/projects/thongyi_weijian1/data/CBECS.csv')
data = pd.read_csv(f,sep=',', header='infer', parse_dates=[1])
#data = pd.DataFrame.from_csv('../../lectures/data/CBECS.csv') 
data = data.set_index('PUBID')
data.tail()


Out[2]:
REGION CENDIV PBA FREESTN SQFT SQFTC WLCNS RFCNS RFCOOL RFTILT ... FKCLBTU FKWTBTU FKCKBTU FKOTBTU DHHTBTU DHCLBTU DHWTBTU DHCKBTU DHOTBTU PUBCLIM
PUBID
6716 3 5 14 1.0 108000 7 1 6 2 1 ... 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN 2
6717 3 7 5 1.0 1700 2 5 5 2 2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2
6718 2 3 26 1.0 2000 2 1 4 2 2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
6719 1 2 12 1.0 19250 4 1 4 2 1 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2
6720 3 5 14 1.0 142000 7 1 1 1 2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2

5 rows × 1118 columns

Office, inpatient health care, service, public assembly and education buildings are selected.


In [74]:
energydata=pd.DataFrame()

type_B=[2,16,26,13,14] # office, inpatient health care, service, public assembly and education
type_C=[1,3,4,5,6,7,8,9,10,11,12,15,17,18,19,20,21,22,23,24,25,91]

data_type=data
data_type=data_type[data_type.NGUSED!=2]

for i in type_C:
    data_type=data_type[data_type.PBA!=i]

energydata['Building Type']=data_type.PBA

index=['ELBTU','NGBTU','ELVNBTU','NGHTBTU']
for i in index:
    energydata[i]=data_type[i]/data_type.SQFT
energydata


Out[74]:
Building Type ELBTU NGBTU ELVNBTU NGHTBTU
PUBID
7 16 111.653990 51.838735 23.739538 0.000000
9 2 59.424400 19.931467 17.621520 16.893707
10 2 91.791085 0.222767 21.699827 0.000000
11 2 42.062146 20.280000 18.361463 19.146244
13 26 34.994000 6.896857 2.202000 6.896857
15 14 62.133773 17.110851 4.380262 9.513459
16 14 44.434808 35.471904 4.091010 25.715029
18 2 10.216667 8.183000 2.971167 8.183000
20 2 37.380282 22.818521 14.557141 21.278775
23 13 22.276267 9.973267 3.859567 9.730733
24 16 84.670228 143.382582 14.037165 67.848323
30 14 23.974024 34.225133 2.972892 30.004072
31 16 52.721559 0.449975 12.871851 0.000000
35 14 20.924827 38.079383 2.216235 32.082160
36 2 30.185778 8.474478 13.816567 7.493933
37 13 25.851400 23.329000 0.672800 22.582200
47 16 112.065507 36.251796 32.019704 0.000000
51 2 41.638982 29.714506 14.336681 20.583518
54 16 116.452243 129.202617 19.629003 7.006450
56 14 23.226000 50.701273 1.994788 17.707616
64 13 61.291410 153.400525 9.889967 152.621377
67 14 17.523857 0.834667 1.261571 0.603048
69 2 111.319939 3.025333 33.627515 0.912000
70 13 45.290962 2.109231 1.837308 0.069231
74 16 138.391714 97.729357 17.607966 48.233034
76 2 76.080885 2.363548 25.345914 0.000000
81 14 42.551833 68.709167 4.299833 58.528542
85 26 1.971039 29.858182 0.175065 25.146883
92 14 24.616567 6.623077 4.013740 6.623077
95 13 39.543274 1.249421 2.296225 0.700909
... ... ... ... ... ...
6611 2 56.035500 38.796500 11.106500 31.261500
6617 16 119.570520 93.806463 18.372153 67.871923
6618 16 93.863391 122.775413 13.017591 97.630818
6619 16 95.567550 0.193555 15.845235 0.000000
6624 13 21.094286 109.480000 2.070000 105.707619
6629 2 186.984667 20.875852 36.775889 19.644852
6637 16 178.824627 37.098507 7.642415 0.000000
6643 14 55.829700 414.871325 2.651900 316.689950
6648 14 26.283905 44.791524 2.631619 35.409929
6651 14 35.292222 18.194889 2.715422 18.194889
6656 14 34.797421 54.591825 7.462667 29.633421
6659 2 23.011429 42.522857 2.783429 42.522857
6662 14 70.270060 1.818155 12.736560 1.027964
6675 16 52.161962 143.888599 9.273709 43.307132
6681 14 41.022364 46.330873 5.011568 39.805356
6683 2 47.802000 52.958667 9.151333 52.958667
6688 14 34.410140 42.106047 2.576000 31.000186
6692 14 61.797682 40.177209 4.475609 40.177209
6693 14 50.435936 56.027374 7.290974 44.929596
6694 26 17.754971 30.111571 3.015971 30.111571
6695 16 136.860003 0.271830 24.284254 0.000000
6696 14 41.743009 46.501628 7.560744 31.320228
6700 16 76.598659 113.545840 13.078797 80.291207
6709 14 18.228881 99.498966 3.815614 89.770335
6710 2 64.932231 7.261769 16.085308 7.261769
6712 13 10.228545 12.710000 0.397364 12.526364
6713 14 51.918501 0.315094 13.446402 0.000000
6716 14 42.192602 27.452917 4.503444 22.808935
6718 26 104.144500 135.966500 4.339500 127.668000
6720 14 23.122232 23.920761 2.676141 17.144408

2259 rows × 5 columns


In [75]:
energydata = energydata.dropna(how='any')
energydata = energydata[~(energydata == 0).any(axis=1)]
PBA1=energydata['Building Type'].unique()
PBA1
count=[]
for i in PBA1:
    count.append([energydata[energydata['Building Type']==i].shape[0],i])
count


Out[75]:
[[714, 2], [244, 26], [471, 14], [226, 13], [283, 16]]

In [91]:
type_C=['Office','Inpatient health care','Service','Public assembly','Education']
fig1 = plt.figure(figsize=(20,15))
times=1
data_seperate=[]
# energydata[energydata['Building Type']==type_B[1]]
for i in range(len(type_B)):
    x=energydata[energydata['Building Type']==type_B[i]]
    x=x.drop(x.columns[0],axis=1)
    data_seperate.append(x)
for i in range(len(type_B)):
    plt.subplot(len(type_B),2,times)
    data_seperate[i].boxplot()
    times=times+1
    plt.title(type_C[i])
    plt.ylim(0,200)


C:\Users\Daniel\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.

In [80]:
y=pd.DataFrame()
for i in range(len(type_B)):
    y=y.append(data_seperate[i])
X=y.as_matrix().astype(np.float32)
from sklearn.cluster import KMeans
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')


1257
168
198
302
13

In [81]:
# plot stem figure to see the partten and outlier
y['assignment']=cluster_assignments
y=y[y.assignment!=4]
del y['assignment']
X=y.as_matrix().astype(np.float32)

In [82]:
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust+1,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')


1118
208
372
192
35

In [83]:
# plot stem figure to see the partten and outlier
y['assignment']=cluster_assignments
y=y[y.assignment!=4]
del y['assignment']
X=y.as_matrix().astype(np.float32)

In [84]:
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust+1,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')


1084
204
41
214
347

In [85]:
# plot stem figure to see the partten and outlier
y['assignment']=cluster_assignments
y=y[y.assignment!=2]
del y['assignment']
X=y.as_matrix().astype(np.float32)

In [86]:
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust+1,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')


854
165
477
140
213

The connection between assignment and building type need to be found by visually judging. The connecting can be different every time the file was run.


In [87]:
y['assignment']=cluster_assignments
y=y.join(data['PBA'],how='inner')
y
y['judge']=1
y['judge'].iloc[np.where(np.array(y.PBA)==2)]=0
y['judge'].iloc[np.where(np.array(y.PBA)==16)]=3
y['judge'].iloc[np.where(np.array(y.PBA)==26)]=1
y['judge'].iloc[np.where(np.array(y.PBA)==13)]=4
y['judge'].iloc[np.where(np.array(y.PBA)==14)]=2
y[y['judge']==y['assignment']].count()


C:\Users\Daniel\Anaconda3\lib\site-packages\pandas\core\indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
Out[87]:
ELBTU         727
NGBTU         727
ELVNBTU       727
NGHTBTU       727
assignment    727
PBA           727
judge         727
dtype: int64

In [88]:
a=727/(714+283+244+226+471)

In [89]:
a


Out[89]:
0.3751289989680083

In [ ]: